library("readr")
library("igraph")
library("dplyr")
library("stringr")
library("scales")
library("textreuse")
source("R/helper.R")
source("R/section-matches.R")
Read the data.
# Restore the cached analysis objects from disk.
# NOTE(review): best_matches, all_matches and summary_matches are used below
# without being created in this script, so they presumably come from this
# .rda file -- confirm against the code that writes the cache.
load("cache/corpus-lsh.rda")
# Run the spectrogram script; presumably this defines spectrogram(), which is
# called below but not defined in this file.
source("R/spectrogram.R")
Spectrogram of borrowings in CA1850. I expect it will show heavy reliance on NY1849 amidst original material.
# Draw the spectrogram for code CA1850 from the best_matches table.
# white_list = 7 is passed straight to spectrogram(); see R/spectrogram.R for
# what that argument controls.
spectrogram("CA1850", best_matches, white_list = 7)
## Loading required package: ggplot2
## Selecting by n
Spectrogram of borrowings in MD1855. I expect to see reliance on English legislation (GB1852 and GB1854), amidst original material.
# Draw the spectrogram for code MD1855 from the best_matches table.
# white_list = 7 is passed straight to spectrogram(); see R/spectrogram.R for
# what that argument controls.
spectrogram("MD1855", best_matches, white_list = 7)
## Selecting by n
Spectrogram of borrowings in WA1855. An earlier version showed two long runs of borrowings from Oregon and Indiana. If the latest data still visualizes that kind of splicing, I think it’s a great illustration.
# Draw the spectrogram for code WA1855 from the best_matches table.
# white_list = 8 is passed straight to spectrogram(); see R/spectrogram.R for
# what that argument controls.
spectrogram("WA1855", best_matches, white_list = 8)
## Selecting by n
The point of this and the next exercise is not to show borrowings from all sources, but similarity between two sources. Our normal spectrogram of NV1861 will show heavy reliance on California. I want this to show that even though Nevada is two steps away from New York, its text is still very similar to New York throughout.
# One row per distinct NV1861 section found in all_matches, in sorted order.
# This table is the left side of the join below, so sections with no
# qualifying New York match still appear in the result.
all_NV1861 <- data_frame(
  borrower_code = "NV1861",
  borrower_section = sort(unique(
    filter(all_matches, borrower_code == "NV1861")$borrower_section
  ))
)

# For each NV1861 section keep the single highest-scoring match whose code
# contains "NY", has a score of at least 0.1, and dates from 1861 or earlier.
matches_to_NY <- all_matches %>%
  filter(borrower_code == "NV1861" &
           str_detect(match_code, "NY") &
           score >= 0.1 &
           match_year <= 1861) %>%
  group_by(borrower_section) %>%
  arrange(desc(score)) %>%
  slice(1) %>%
  ungroup()

# Attach the New York matches to the full section list, ordered by section.
NV1861toNY <- arrange(left_join(all_NV1861, matches_to_NY), borrower_section)
## Joining by: c("borrower_code", "borrower_section")

spectrogram(
  "NV1861", NV1861toNY,
  white_list = 5,
  title = "Sections in NV1861 with high similarity to NY codes"
)
## Selecting by n
Like the previous query for NV1861, Iowa is two steps away from New York. This time, however, the text is much less similar to New York. I only want one visualization for this, but I’m not sure if IA1851 or the revision IA1859 will be better, so can we do both?
IA1851
# One row per distinct IA1851 section found in all_matches, in sorted order.
# This table is the left side of the join below, so sections with no
# qualifying New York match still appear in the result.
all_IA1851 <- data_frame(
  borrower_code = "IA1851",
  borrower_section = sort(unique(
    filter(all_matches, borrower_code == "IA1851")$borrower_section
  ))
)

# For each IA1851 section keep the single highest-scoring match whose code
# contains "NY", has a score of at least 0.1, and dates from 1851 or earlier.
matches_to_NY <- all_matches %>%
  filter(borrower_code == "IA1851" &
           str_detect(match_code, "NY") &
           score >= 0.1 &
           match_year <= 1851) %>%
  group_by(borrower_section) %>%
  arrange(desc(score)) %>%
  slice(1) %>%
  ungroup()

# Attach the New York matches to the full section list, ordered by section.
IA1851toNY <- arrange(left_join(all_IA1851, matches_to_NY), borrower_section)
## Joining by: c("borrower_code", "borrower_section")

spectrogram(
  "IA1851", IA1851toNY,
  white_list = 10,
  title = "Sections in IA1851 with matches to NY codes"
)
## Selecting by n
IA1859
# One row per distinct IA1859 section found in all_matches, in sorted order.
# This table is the left side of the join below, so sections with no
# qualifying New York match still appear in the result.
all_IA1859 <- data_frame(
  borrower_code = "IA1859",
  borrower_section = sort(unique(
    filter(all_matches, borrower_code == "IA1859")$borrower_section
  ))
)

# For each IA1859 section keep the single highest-scoring match whose code
# contains "NY", has a score of at least 0.1, and dates from 1859 or earlier.
matches_to_NY <- all_matches %>%
  filter(borrower_code == "IA1859" &
           str_detect(match_code, "NY") &
           score >= 0.1 &
           match_year <= 1859) %>%
  group_by(borrower_section) %>%
  arrange(desc(score)) %>%
  slice(1) %>%
  ungroup()

# Attach the New York matches to the full section list, ordered by section.
IA1859toNY <- arrange(left_join(all_IA1859, matches_to_NY), borrower_section)
## Joining by: c("borrower_code", "borrower_section")

spectrogram(
  "IA1859", IA1859toNY,
  white_list = 10,
  title = "Sections in IA1859 with matches to NY codes"
)
## Selecting by n
Spectrogram of borrowings in NC1868. Should show heavy reliance on one or another New York Code, with scattered provisions coming from elsewhere.
# Draw the spectrogram for code NC1868 from the best_matches table.
# white_list = 8 is passed straight to spectrogram(); see R/spectrogram.R for
# what that argument controls.
spectrogram("NC1868", best_matches, white_list = 8)
## Selecting by n
Spectrogram of borrowings in CO1868. Should show heavy reliance on Illinois law.
# Draw the spectrogram for code CO1868 from the best_matches table.
# white_list = 8 is passed straight to spectrogram(); see R/spectrogram.R for
# what that argument controls.
spectrogram("CO1868", best_matches, white_list = 8)
## Selecting by n
Create a network graph based on section percentages.
# Edge list for the percentage-based network: borrower/match pairs with a
# known match code and percent_borrowed of at least 0.05, limited to each
# borrower's top two by weight (top_n() keeps ties). The result is left
# grouped by borrower_code.
edges_pct <- summary_matches %>%
  filter(!is.na(match_code) & percent_borrowed >= 0.05) %>%
  select(borrower_code, match_code, weight = percent_borrowed) %>%
  group_by(borrower_code) %>%
  top_n(n = 2, wt = weight)
edges_pct
## Source: local data frame [136 x 3]
## Groups: borrower_code [83]
##
## borrower_code match_code weight
## (chr) (chr) (dbl)
## 1 AK1900 OR1862 0.5937
## 2 AR1868 KY1851 0.3634
## 3 AR1868 KY1854 0.3158
## 4 AR1874 AR1868 0.6752
## 5 AR1874 KY1851 0.0818
## 6 AZ1865 CA1851 0.5476
## 7 AZ1865 CA1858 0.2590
## 8 AZ1887 CA1872 0.4313
## 9 CA1850 NY1849 0.2972
## 10 CA1850 NY1850 0.1207
## .. ... ... ...
# First pass: build the graph only to measure every code's hop distance to
# NY1850, then turn that distance into a vertex colour.
g <- graph_from_data_frame(edges_pct, directed = TRUE)
nodes <- distances(g, to = "NY1850", algorithm = "unweighted") %>%
  as.data.frame() %>%
  add_rownames() %>%
  rename(name = rowname, distance = NY1850) %>%
  mutate(
    color = ifelse(
      distance == 0, "red",
      ifelse(
        distance == 1, "green",
        ifelse(distance == 2, "yellow", "lightblue")
      )
    )
  )
## Warning in distances(g, to = "NY1850", algorithm = "unweighted"):
## Unweighted algorithm chosen, weights ignored
# The four early New York codes are always red, whatever their distance.
nodes$color[nodes$name %in% c("NY1848", "NY1849", "NY1850", "NY1851")] <- "red"

# Second pass: rebuild the graph with the vertex table attached, add the year
# parsed from each code name, and fix the layout with a seeded graphopt run.
g <- graph_from_data_frame(edges_pct, directed = TRUE, vertices = nodes)
V(g)$year <- extract_date(V(g)$name)
set.seed(4221)
g <- add_layout_(
  g,
  with_graphopt(niter = 4000, spring.length = 25),
  normalize()
)
# Plot the subgraph of `x` made of the vertices whose `year` attribute is at
# most `year`, placing them with the matching rows of the layout stored on `x`.
#
# Arguments:
#   x    - igraph graph carrying a vertex attribute `year`, an edge attribute
#          `weight`, and a graph attribute `layout` with one row per vertex.
#   year - inclusive cutoff year; also interpolated into the plot title.
#
# Called for its side effect (a plot on the current graphics device). The
# margins set for the plot are restored when the function exits.
plot_before_year <- function(x, year) {
  # which() drops vertices whose year is NA, so they are never plotted.
  keep <- which(V(x)$year <= year)
  x_before <- induced_subgraph(x, keep)

  # Subset the layout by vertex index rather than by name, and keep it a
  # matrix (drop = FALSE) even when only a single vertex qualifies.
  x_before$layout <- x$layout[keep, , drop = FALSE]

  # Tight margins for this plot only; put the caller's settings back on exit.
  old_par <- par(mar = c(0, 0, 1, 0))
  on.exit(par(old_par), add = TRUE)

  plot(x_before, edge.width = E(x_before)$weight * 8,
       edge.arrow.size = 0.0, vertex.size = 5)
  title(paste0("Codes of Civil Procedure before ", year))
}
# One plot per five-year cutoff, from 1850 through 1900.
for (cutoff_year in seq(from = 1850, to = 1900, by = 5)) {
  plot_before_year(g, cutoff_year)
}
Create a graph based on numbers (not percentages) of sections shared. Notice that we are keeping only code to code matches that share a certain number of sections (minimum_n), we are keeping only a certain number of matches for each code (top_matches), and we are omitting codes that aren’t part of the main network.
# Smallest sections_borrowed value a code-to-code match needs to become an
# edge (used in the edges_n filter below).
minimum_n <- 20
# Number of highest-weight matches kept per borrowing code (passed to top_n()
# below; ties are kept).
top_matches <- 2
# Codes removed from the edge list, whether they appear as borrower or as
# match. Commented-out entries are currently NOT excluded.
codes_not_to_plot <- c(
# "CO1868",
# "CT1879",
# "FL1847",
# "FL1892",
"GA1851",
"GA1860",
"HI1859",
"HI1897",
# "IL1866",
"LA1825",
"LA1844"
# "MS1848",
# "MS1857",
# "NY1876",
# "NY1879",
# "VA1860",
# "VA1893",
# "WV1868"
)
# Uncomment to disable the exclusion list and plot every code:
# codes_not_to_plot <- NULL
# Edge list weighted by the number of sections shared. Matches below
# minimum_n are dropped, each borrower keeps its top_matches strongest links
# (ties kept), and weights are rescaled before the excluded codes are
# removed, so the rescaling range still includes the excluded codes.
edges_n <- summary_matches %>%
  filter(sections_borrowed >= minimum_n & !is.na(match_code)) %>%
  select(borrower_code, match_code, weight = sections_borrowed) %>%
  group_by(borrower_code) %>%
  top_n(n = top_matches, wt = weight) %>%
  ungroup() %>%
  mutate(weight = rescale(weight)) %>%
  filter(!(borrower_code %in% codes_not_to_plot |
             match_code %in% codes_not_to_plot))
edges_n
## Source: local data frame [146 x 3]
##
## borrower_code match_code weight
## (chr) (chr) (dbl)
## 1 AK1900 OR1862 0.398192771
## 2 AK1900 NY1850 0.002409639
## 3 AK1900 OR1854 0.002409639
## 4 AR1868 KY1851 0.209036145
## 5 AR1868 KY1854 0.180120482
## 6 AR1874 AR1868 0.146987952
## 7 AR1874 KY1851 0.007228916
## 8 AZ1865 CA1851 0.199397590
## 9 AZ1865 CA1858 0.087951807
## 10 AZ1887 CA1872 0.203614458
## .. ... ... ...
# First pass: build the graph to measure, for every code, the smallest hop
# distance to any of the four early New York codes.
g_n <- graph_from_data_frame(edges_n, directed = TRUE)
node_distances <- apply(
  distances(g_n, to = c("NY1848", "NY1849", "NY1850", "NY1851"),
            algorithm = "unweighted"),
  MARGIN = 1, FUN = min, na.rm = TRUE
)
## Warning in distances(g_n, to = c("NY1848", "NY1849", "NY1850", "NY1851"), :
## Unweighted algorithm chosen, weights ignored

# Vertex table: colour encodes the distance (0 red, 1 green, 2 yellow,
# anything else light blue).
nodes_n <- data_frame(name = names(node_distances),
                      distance = node_distances) %>%
  mutate(
    color = ifelse(
      distance == 0, "red",
      ifelse(
        distance == 1, "green",
        ifelse(distance == 2, "yellow", "lightblue")
      )
    )
  )

# Second pass: rebuild with the vertex table and add the year parsed from
# each code name.
g_n <- graph_from_data_frame(edges_n, directed = TRUE, vertices = nodes_n)
V(g_n)$year <- extract_date(V(g_n)$name)
# Turn the edge weights of graph `g` into plotting widths: scale each weight
# by `multiplier`, cap the result at `max_val`, then raise anything below
# `min_val` up to `min_val`. Returns a numeric vector, one value per edge.
edge_size_clamp <- function(g, multiplier = 20, max_val = 6, min_val = 1) {
  scaled <- E(g)$weight * multiplier
  pmax(pmin(scaled, max_val), min_val)
}
# Seeded graphopt layout so the picture is reproducible, then draw the
# section-count network with clamped edge widths and no arrowheads.
set.seed(4221)
g_n <- add_layout_(
  g_n,
  with_graphopt(niter = 4000, spring.length = 25),
  normalize()
)
par(mar = c(0, 0, 1, 0))
plot(
  g_n,
  vertex.size = 5,
  edge.arrow.size = 0,
  edge.width = edge_size_clamp(g_n)
)
title("Borrowings between codes, number of sections borrowed")
Now do a state to state network:
# Thresholds for the state-level network: minimum summed sections for a
# state pair to count, and how many top lenders each borrowing state keeps.
min_state_borrowings <- 100
top_matches <- 2

# Collapse code-level matches into state pairs. Only matches with a known
# match code, a match dated no later than the borrower, and a different state
# are summed; pairs under the threshold are dropped, each borrower keeps its
# top_matches heaviest links (ties kept), and weights are rescaled.
edges_states <- summary_matches %>%
  mutate(
    borrower_date = extract_date(borrower_code),
    match_date = extract_date(match_code),
    borrower_state = extract_state(borrower_code),
    match_state = extract_state(match_code)
  ) %>%
  filter(!is.na(match_code) &
           borrower_date >= match_date &
           borrower_state != match_state) %>%
  group_by(borrower_state, match_state) %>%
  summarise(n = sum(sections_borrowed)) %>%
  filter(n >= min_state_borrowings) %>%
  select(borrower_state, match_state, weight = n) %>%
  group_by(borrower_state) %>%
  top_n(n = top_matches, wt = weight) %>%
  ungroup() %>%
  mutate(weight = rescale(weight))
edges_states
## Source: local data frame [44 x 3]
##
## borrower_state match_state weight
## (chr) (chr) (dbl)
## 1 AK OR 0.57655039
## 2 AR KY 0.60562016
## 3 AZ CA 0.75484496
## 4 CA NY 0.69670543
## 5 CO CA 0.05910853
## 6 CO IL 0.04166667
## 7 DC IN 0.11821705
## 8 DT ND 0.22674419
## 9 DT NE 0.19476744
## 10 FL NY 0.03488372
## .. ... ... ...
# First pass: build the state graph to measure each state's hop distance
# to NY.
g_states <- graph_from_data_frame(edges_states, directed = TRUE)
state_distances <- distances(g_states, to = "NY", algorithm = "unweighted")
## Warning in distances(g_states, to = "NY", algorithm = "unweighted"):
## Unweighted algorithm chosen, weights ignored

# Vertex table: colour encodes the distance (0 red, 1 green, 2 yellow,
# anything else light blue).
nodes_states <- data_frame(name = rownames(state_distances),
                           distance = state_distances[, 1]) %>%
  mutate(
    color = ifelse(
      distance == 0, "red",
      ifelse(
        distance == 1, "green",
        ifelse(distance == 2, "yellow", "lightblue")
      )
    )
  )

# Second pass: rebuild with the vertex table, split into connected components
# of at least three vertices, and keep the first component returned.
g_states <- decompose(
  graph_from_data_frame(edges_states, directed = TRUE,
                        vertices = nodes_states),
  min.vertices = 3
)[[1]]
# Seeded graphopt layout so the picture is reproducible.
set.seed(4221)
g_states <- g_states %>%
  add_layout_(with_graphopt(niter = 4000, spring.length = 25),
              normalize())
par(mar = c(0, 0, 1, 0))
# Edge widths must be computed from g_states itself. Taking them from g_n
# (the code-level graph) gives a vector that belongs to a different set of
# edges and does not correspond to the state-to-state weights.
plot(g_states,
     edge.width = edge_size_clamp(g_states), edge.arrow.size = 0.5,
     edge.arrow.mode = 1,
     vertex.size = 5, vertex.label.dist = 0.85, vertex.label.degree = pi)
title("Borrowings between states, number of sections borrowed")